package com.muchong.spider;

import java.util.Set;

import javax.sql.DataSource;
import java.sql.SQLException;

import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

import com.muchong.entity.MuUser;
import com.muchong.util.LoginUtil;
import com.xiaoleilu.hutool.db.Entity;
import com.xiaoleilu.hutool.db.SqlRunner;
import com.xiaoleilu.hutool.db.ds.DSFactory;
import com.xiaoleilu.hutool.util.CollectionUtil;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

import com.xiaoleilu.hutool.setting.dialect.Props;

/**
 * 
 * @author Mark
 * 小木虫网站地址:http://muchong.com/bbs/index.php
 *
 */
public class GetUser implements PageProcessor {

	static Logger logger = Logger.getLogger(GetUser.class);
	static Site site = Site.me().setRetryTimes(3).setSleepTime(2000).setTimeOut(3000);
	private Set<Cookie> cookies;

	static DataSource ds = DSFactory.get();
	static SqlRunner sqlrunner = SqlRunner.create(ds);
	static String id;

	public void process(Page page) {

		// 获取页面的html
		Html html = page.getHtml();

		String userName = html.xpath("//title/text()").regex(".*? - 用户 -").toString().replaceAll("- 用户 -", "");
		logger.error("用户名:" + userName);

		// 听众
		String listen = html.regex("听众:.*?&nbsp;").regex("[0-9]+").toString();
		// System.out.println("听众:"+listen);
		logger.error("听众:" + listen);

		// 获取注册时间
		String registerTime = html.xpath("//table[@class='userinfo base']/tbody/tr/td[1]/text()").toString();
		// System.out.println("注册时间" + registerTime);
		logger.error("注册时间" + registerTime);
		// mu.setRegister(DateUtil.parse(registerTime));

		// 获取最后活跃时间
		String lastTime = html.xpath("//table[@class='userinfo base']/tbody/tr/td[2]/text()").toString();
		// System.out.println("最后活跃时间" + lastTime);
		logger.error("最后活跃时间" + lastTime);

		// 获取用户组
		String userTeam = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[@class='base']/td[2]/text()")
				.toString();
		// System.out.println("用户组:" + userTeam);
		logger.error("用户组:" + userTeam);

		// 应助
		String help = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[@class='base']/td[3]/text()")
				.toString();
		// System.out.println("应助:" + help);
		logger.error("应助:" + help);

		// 获取金币
		String gold = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[2]/td[2]/text()").toString();
		// System.out.println("金币:" + gold);
		logger.error("金币:" + gold);

		// 散金币
		String giveGold = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[2]/td[3]/text()").toString();
		// System.out.println("散金币:" + giveGold);
		logger.error("散金币:" + giveGold);

		// 沙发
		String saf = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[3]/td[1]/text()").toString();
		// System.out.println("沙发:" + saf);
		logger.error("沙发:" + saf);

		// 帖子
		String topic = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[3]/td[2]/text()").toString();
		// System.out.println("发帖:" + topic);
		logger.error("发帖:" + topic);

		// 在线时间
		String onlineTime = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[4]/td[1]/text()")
				.regex("[0-9-.]+").toString();
		// System.out.println("在线时间:" + onlineTime+"小时");
		logger.error("在线时间:" + onlineTime + "小时");

		// 专业
		String subject = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[4]/td[3]/text()").toString();
		// System.out.println("专业:" + subject);
		logger.error("专业:" + subject);

		// 性别
		String sex = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[5]/td[1]/font/text()").toString();
		// System.out.println("性别:" + sex);
		logger.error("性别:" + sex);

		// 来自
		String from = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[5]/td[2]/text()").toString();
		// System.out.println("来自:" + from);
		logger.error("来自:" + from);

		// 生日
		String birth = html.xpath("//div[@class='user_index_info']/table[2]/tbody/tr[5]/td[3]/text()").toString();
		// System.out.println("生日:" + birth);
		logger.error("生日:" + birth);

		// 红花
		String red = html.xpath("//div[@style='padding:10px;font-weight:bold;']/a/text()").regex("[0-9]+").toString();
		// System.out.println("红花:" + red);
		logger.error("红花:" + red);

		String head = html.xpath("//td[@class='space_avatar']/img/@src").toString();
		// System.out.println("头像:" + head);
		logger.error("头像:" + head);
		MuUser mu = new MuUser();
		mu.setBirth(birth);
		mu.setFrom(from);
		mu.setGiveGolden(giveGold);
		mu.setGolden(gold);
		mu.setHead(head);
		mu.setHelp(help);
		mu.setId(id);
		mu.setLastLive(lastTime);
		mu.setListen(listen);
		mu.setOnline(onlineTime);
		mu.setRedFlower(red);
		mu.setRegister(registerTime);
		mu.setSafa(saf);
		mu.setSex(sex);
		mu.setSubject(subject);
		mu.setTopic(topic);
		mu.setUsername(userName);
		mu.setUserTeam(userTeam);
		System.out.println("持久化----" + mu.toString() + "---sucess");
		logger.error("持久化----" + mu.toString() + "---sucess");
		try {
			sqlrunner.insert(CollectionUtil.newArrayList(Entity.parse(mu)));
		} catch (SQLException e) {
			e.printStackTrace();
		}

	}

	/**
	 * 使用谷歌浏览器驱动登录获取cookies
	 * 
	 * @throws InterruptedException
	 */
	public void login() throws InterruptedException {
		Props props = new Props("user.properties");
		System.setProperty("webdriver.chrome.driver", props.getStr("path"));
		WebDriver driver = new ChromeDriver();
		driver.get("http://muchong.com/bbs/logging.php?action=login");

		driver.findElement(By.name("username")).sendKeys(props.getStr("user"));
		driver.findElement(By.xpath("//input[@tabindex='2']")).sendKeys(props.getStr("pwd"));
		driver.findElement(By.xpath("//input[@value='会员登录']")).click();

		// 获得计算方法
		String pr = driver.findElement(By.xpath("//div[@style='padding:10px 0;']")).getText();
		System.out.println(pr);
		String result = LoginUtil.getResult(pr);
		System.out.println(result);
		driver.findElement(By.name("post_sec_code")).sendKeys(result);
		driver.findElement(By.xpath("//input[@class='xmc_Big_btn reg_button1']")).click();
		System.out.println("登录成功！");
		cookies = driver.manage().getCookies();
		driver.close();

	}

	/**
	 * 将cookie和浏览器头放入webmagic
	 */
	public Site getSite() {
		for (Cookie cookie : cookies) {
			site.addCookie(cookie.getName().toString(), cookie.getValue().toString());
		}

		// 如果使用代理，可以配置代理池
		// List<String[]> poolHosts = new ArrayList<String[]>();
		// poolHosts.add(new
		// String[]{"username","password","49.85.6.98","22474"});
		// poolHosts.add(new
		// String[]{"username","password","123.162.86.47","34252"});
		// site.setHttpProxyPool(poolHosts);

		return site.addHeader("User-Agent",
				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1");
	}

	public static void main(String[] args) throws InterruptedException {
		long before = System.currentTimeMillis();
		GetUser user = new GetUser();
		user.login();
		int t = 0;
		int n = 0;
		for (int i = 5006000; i < 8500000; i++) {
			id = i + "";
			t += 1;
			if (t == 500) {
				n += 1;
				logger.error("----第  " + n + "次500用户");
				Thread.sleep(1000 * 2400);
				t = 0;
			}
			Spider.create(user).addUrl("http://muchong.com/bbs/space.php?uid=" + i).thread(1).run();
			System.out.println("----正在抓取第  " + i + " 个用户");
			logger.error("----正在抓取第  " + i + " 个用户");
		}
		long after = System.currentTimeMillis();
		long x = (after - before) / 1000;
		logger.error("花费时间:" + x + "秒");
		System.exit(0);

	}
}
